library(readr)
## Warning: package 'readr' was built under R version 3.3.2
library(fpp)
## Loading required package: forecast
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.3.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 7.3
## Loading required package: fma
## Loading required package: tseries
## Warning: package 'tseries' was built under R version 3.3.2
## Loading required package: expsmooth
## Loading required package: lmtest
library(TTR)
Final_Travel <- read.csv("/Users/Srinivas/Desktop/Final_Travel.csv")
travel<- Final_Travel$Value
#plot(travel,type = "l")
travel_ts <- ts(travel,start = c(1999,1),frequency = 12)
plot(travel_ts)

#Time series plot of International travel to United States between the year 1999 to 2016.
#We can see that from time series that there seems to be seasonal variation in the number of visitors every year.
#There is peak every alternate year.It can be described using an additive model,as the seasonal fluctuations are roughly constant in size over time and do not seem to depend on the level of the time series,
#and the random fluctuations also seem to be roughly constant in size over time.There is significant drop towards the end of 2001 because of the 9/11 attack
travel_ts_2 <- window(travel_ts,start = c(2002,1))
plot(travel_ts_2)

#the plot is from 2002 to 2016 removing the drop which occured due to 9/11 incident
summary(travel_ts_2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1197000 1735000 2056000 2206000 2569000 4075000
boxplot(travel_ts_2)

##The box plot indicates that the number of International Travel is between 1197000 and 2569000 with an median of 2056000.The number of tourists visits varies in this range.Also the mean and
#median are not much separated which gives an idea that roughly around 50% of values are less than mean and 50% values are greater than mean.
#There are also some outliers beyond 4075000 which suggests that there are some values in the dataset that do no fit in the possible range of datapoints.
travel_decomp <- stl(travel_ts_2,s.window = "periodic")
plot(travel_decomp)

#Yes the time Series is seasonal
summary(travel_decomp)
## Call:
## stl(x = travel_ts_2, s.window = "periodic")
##
## Time.series components:
## seasonal trend remainder
## Min. :-515059.7 Min. :1477540 Min. :-276320.1
## 1st Qu.:-197467.9 1st Qu.:1795731 1st Qu.: -75341.8
## Median : 31207.8 Median :2083809 Median : -6196.6
## Mean : -382.5 Mean :2207176 Mean : -551.4
## 3rd Qu.: 138883.7 3rd Qu.:2556364 3rd Qu.: 67255.9
## Max. : 521692.5 Max. :3241703 Max. : 322003.0
## IQR:
## STL.seasonal STL.trend STL.remainder data
## 336352 760633 142598 834078
## % 40.3 91.2 17.1 100.0
##
## Weights: all == 1
##
## Other components: List of 5
## $ win : Named num [1:3] 1761 19 13
## $ deg : Named int [1:3] 0 1 1
## $ jump : Named num [1:3] 177 2 2
## $ inner: int 2
## $ outer: int 0
travel_decomp_1 <-decompose(travel_ts_2)
travel_decomp_1$type
## [1] "additive"
#Time series is additive in nature. At a particular point in time series the value is the summation of all the three components of the time series.
seasonal_indices <- travel_decomp_1$seasonal
seasonal_indices
## Jan Feb Mar Apr May Jun
## 2002 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2003 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2004 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2005 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2006 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2007 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2008 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2009 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2010 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2011 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2012 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2013 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2014 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2015 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## 2016 -414382.74 -522276.94 -173650.06 -48804.42 30903.31 76409.67
## Jul Aug Sep Oct Nov Dec
## 2002 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2003 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2004 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2005 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2006 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2007 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2008 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2009 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2010 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2011 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2012 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2013 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2014 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2015 503879.35 470083.42 171812.04 130449.21 -269600.93 45178.10
## 2016 503879.35 470083.42
barplot(seasonal_indices[1:12],xlab="Months",ylab = "Number of Visitors",axisnames = TRUE)

max_indices <- which(seasonal_indices==max(seasonal_indices))
max_indices
## [1] 7 19 31 43 55 67 79 91 103 115 127 139 151 163 175
# July
min_indices <- which(seasonal_indices==min(seasonal_indices))
min_indices
## [1] 2 14 26 38 50 62 74 86 98 110 122 134 146 158 170
# February
#For the month of July time series value is high and for the month of February the value of time series is low.
#generally during the month of July ,The climate is much pleasant compared to other months which may be a reason for people to travel .During that time lot of International students come to US for Education which may be another reason for the value being high during that month.
plot(travel_ts_2)
lines(seasadj(travel_decomp_1),col="red")

# Yes seasonality has a major impact on the time series.Plot for seasonal adjusted time series is almost a straight line with a positive slope.
# Naive
naive_forecast <- naive(travel_ts_2,12)
naive_forecast
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Sep 2016 3836721 3450382 4223060 3245866 4427576
## Oct 2016 3836721 3450382 4223060 3245866 4427576
## Nov 2016 3836721 3450382 4223060 3245866 4427576
## Dec 2016 3836721 3450382 4223060 3245866 4427576
## Jan 2017 3836721 3450382 4223060 3245866 4427576
## Feb 2017 3836721 3450382 4223060 3245866 4427576
## Mar 2017 3836721 3450382 4223060 3245866 4427576
## Apr 2017 3836721 3450382 4223060 3245866 4427576
## May 2017 3836721 3450382 4223060 3245866 4427576
## Jun 2017 3836721 3450382 4223060 3245866 4427576
## Jul 2017 3836721 3450382 4223060 3245866 4427576
## Aug 2017 3836721 3450382 4223060 3245866 4427576
summary(naive_forecast)
##
## Forecast method: Naive method
##
## Model Information:
## $drift
## [1] 0
##
## $drift.se
## [1] 0
##
## $sd
## [1] 301948.4
##
## $call
## naive(y = travel_ts_2, h = 12)
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 15081.85 301462 240545.6 -0.2643395 11.06543 1.397038
## ACF1
## Training set -0.2024705
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Sep 2016 3836721 3450382 4223060 3245866 4427576
## Oct 2016 3836721 3450382 4223060 3245866 4427576
## Nov 2016 3836721 3450382 4223060 3245866 4427576
## Dec 2016 3836721 3450382 4223060 3245866 4427576
## Jan 2017 3836721 3450382 4223060 3245866 4427576
## Feb 2017 3836721 3450382 4223060 3245866 4427576
## Mar 2017 3836721 3450382 4223060 3245866 4427576
## Apr 2017 3836721 3450382 4223060 3245866 4427576
## May 2017 3836721 3450382 4223060 3245866 4427576
## Jun 2017 3836721 3450382 4223060 3245866 4427576
## Jul 2017 3836721 3450382 4223060 3245866 4427576
## Aug 2017 3836721 3450382 4223060 3245866 4427576
plot(naive_forecast)

plot(naive_forecast$residuals,type="p")

#residuals are random over the years.
hist(naive_forecast$residuals)

#It is normally distributed with skewness towards the left
plot(as.numeric(naive_forecast$fitted),as.numeric(naive_forecast$residuals),type="p",xlab="Fitted",ylab="Residuals")

#residuals are random and do not depend on fitted values
plot(as.numeric(naive_forecast$x),as.numeric(naive_forecast$residuals),type="p",xlab="Actual",ylab="Residuals")

#residuals are random and independant from the actual values.
Acf(naive_forecast$residuals)

#There is seasonal variation in the acf and justifies the plot in the original time series
summary(naive_forecast)
##
## Forecast method: Naive method
##
## Model Information:
## $drift
## [1] 0
##
## $drift.se
## [1] 0
##
## $sd
## [1] 301948.4
##
## $call
## naive(y = travel_ts_2, h = 12)
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 15081.85 301462 240545.6 -0.2643395 11.06543 1.397038
## ACF1
## Training set -0.2024705
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Sep 2016 3836721 3450382 4223060 3245866 4427576
## Oct 2016 3836721 3450382 4223060 3245866 4427576
## Nov 2016 3836721 3450382 4223060 3245866 4427576
## Dec 2016 3836721 3450382 4223060 3245866 4427576
## Jan 2017 3836721 3450382 4223060 3245866 4427576
## Feb 2017 3836721 3450382 4223060 3245866 4427576
## Mar 2017 3836721 3450382 4223060 3245866 4427576
## Apr 2017 3836721 3450382 4223060 3245866 4427576
## May 2017 3836721 3450382 4223060 3245866 4427576
## Jun 2017 3836721 3450382 4223060 3245866 4427576
## Jul 2017 3836721 3450382 4223060 3245866 4427576
## Aug 2017 3836721 3450382 4223060 3245866 4427576
accuracy(naive_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set 15081.85 301462 240545.6 -0.2643395 11.06543 1.397038
## ACF1
## Training set -0.2024705
#accuracy measure for naive
#seasonalnaive
travel_naive <- snaive(travel_ts_2,h=12)
plot(travel_naive)

plot(travel_naive$residuals,type="p")

#The plot indicates that the residuals are random over the years.
hist(travel_naive$residuals)

#The histogram plot confirms that the residuals are random and follow a normal distribution with mean at 0.
plot(as.numeric(travel_naive$fitted),as.numeric(travel_naive$residuals),type="p",xlab="Fitted",ylab="Residuals")

#The plot shows that the residuals are random and do not depend on fitted values.
plot(as.numeric(travel_naive$x),as.numeric(travel_naive$residuals),type="p",xlab="Actual",ylab="Residuals")

#The plot shows the randomness of residuals and independence from the actual values.
Acf(travel_naive$residuals)

#The Acf plot shows that there is there is residuals at the starting of the time series but is significant later on which implies that residuals are not related to each other.
accuracy(travel_naive)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 115998.2 205853.8 172182.5 4.595344 7.741379 1 0.5732204
summary(travel_naive)
##
## Forecast method: Seasonal naive method
##
## Model Information:
## $drift
## [1] 0
##
## $drift.se
## [1] 0
##
## $sd
## [1] 170580.3
##
## $call
## snaive(y = travel_ts_2, h = 12)
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 115998.2 205853.8 172182.5 4.595344 7.741379 1 0.5732204
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Sep 2016 3551833 3288021 3815645 3148367 3955299
## Oct 2016 3500510 3236698 3764322 3097044 3903976
## Nov 2016 2822990 2559178 3086802 2419524 3226456
## Dec 2016 3145903 2882091 3409715 2742437 3549369
## Jan 2017 2645349 2381537 2909161 2241883 3048815
## Feb 2017 2405849 2142037 2669661 2002383 2809315
## Mar 2017 2897042 2633230 3160854 2493576 3300508
## Apr 2017 2895487 2631675 3159299 2492021 3298953
## May 2017 3213869 2950057 3477681 2810403 3617335
## Jun 2017 3293795 3029983 3557607 2890329 3697261
## Jul 2017 3931057 3667245 4194869 3527591 4334523
## Aug 2017 3836721 3572909 4100533 3433255 4240187
plot(travel_naive)

#By comparing the RMSE between naive and seasonal naive .The RMSE is better in seasonal and has good accuracy.
#The prediction for the next year is almost same as the previous year values,due to some fluctuations the plot shows that 95% times the values will be in shaded blue region.
# Simple moving average
plot(travel_ts_2,ylab="Number of Visitors")
travel_ma3 <- ma(travel_ts_2,order = 3)
lines(travel_ma3,col="red")
travel_ma6 <- ma(travel_ts_2,order = 6)
lines(travel_ma6,col="blue")
travel_ma12 <- ma(travel_ts_2,order = 12)
lines(travel_ma12,col="green")

#The choice of order for moving average is as inferred from the plot above as we go on increasing the order we are actually removing the seasonality. One can easily compare the above plot where green line is almost same as the plot for seasonal adjusted time series.
travel_ma1 <- ma(travel_ts_2,order = 1)
forecast_ma<-forecast(travel_ma1,h=12)
plot(forecast_ma)

# Simple Smoothing
travel_ss <- ets(travel_ts_2)
#ss_summary <- travel_ss$par
forecast.ets(travel_ss)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Sep 2016 3437539 3231023 3644055 3121700 3753378
## Oct 2016 3365511 3146587 3584435 3030695 3700327
## Nov 2016 2803153 2607859 2998448 2504476 3101831
## Dec 2016 3257644 3016566 3498721 2888948 3626340
## Jan 2017 2572956 2372011 2773901 2265637 2880275
## Feb 2017 2452432 2251354 2653511 2144909 2759955
## Mar 2017 2936402 2684727 3188077 2551498 3321306
## Apr 2017 3087290 2811679 3362901 2665780 3508800
## May 2017 3201511 2904728 3498294 2747620 3655401
## Jun 2017 3267131 2953460 3580802 2787413 3746850
## Jul 2017 3894858 3508463 4281252 3303918 4485797
## Aug 2017 3839291 3446509 4232072 3238584 4439998
## Sep 2017 3437539 3075510 3799569 2883863 3991216
## Oct 2017 3365512 3001218 3729805 2808372 3922651
## Nov 2017 2803154 2491740 3114567 2326888 3279419
## Dec 2017 3257644 2886682 3628607 2690306 3824982
## Jan 2018 2572956 2272976 2872936 2114177 3031736
## Feb 2018 2452433 2159995 2744870 2005188 2899677
## Mar 2018 2936402 2578627 3294177 2389232 3483572
## Apr 2018 3087290 2703276 3471305 2499991 3674590
## May 2018 3201511 2795304 3607718 2580271 3822751
## Jun 2018 3267131 2844604 3689659 2620931 3913331
## Jul 2018 3894858 3381792 4407924 3110191 4679525
## Aug 2018 3839291 3324483 4354099 3051959 4626623
summary(travel_ss)
## ETS(M,N,M)
##
## Call:
## ets(y = travel_ts_2)
##
## Smoothing parameters:
## alpha = 0.4147
## gamma = 1e-04
##
## Initial states:
## l = 1615326.4368
## s=1.0256 0.8825 1.0596 1.0823 1.2087 1.2262
## 1.0286 1.0079 0.972 0.9245 0.7721 0.81
##
## sigma: 0.0469
##
## AIC AICc BIC
## 4988.083 4991.083 5035.640
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 21543.29 96382.89 72295.69 0.756152 3.456318 0.4198782
## ACF1
## Training set 0.02090026
#alpha = 0.4147 .Equal weights are given to the previous value to determine the forecasted value.
#sigma: 0.0469 signifies that residuals have a standard deviation of this value.
#Initial states:
#l = 1615326.4368
#s=1.0256 0.8825 1.0596 1.0823 1.2087 1.2262
# 1.0286 1.0079 0.972 0.9245 0.7721 0.81
#ss_summary <- travel_ss$par
plot(travel_ss$residuals,type="p",ylab="Residuals")

#Residuals are random and distributed
hist(travel_ss$residuals,xlab = "Residuals",main="Histogram of Residuals")

#The plot confirms that the residuals are random with a normal distribution with a mean of 0.
plot(as.numeric(travel_ss$fitted),travel_ss$residuals,type="p",xlab="Fitted",ylab="Residuals")

#residuals are independant of fitted values
plot(as.numeric(travel_ss$x),travel_ss$residuals,type="p",xlab="Actual",ylab="Residuals")

#residuals are random and are independant of actual values
Acf(travel_ss$residuals)

#Acf shows spike at 6 .residuals are not significant.
accuracy(travel_ss)
## ME RMSE MAE MPE MAPE MASE
## Training set 21543.29 96382.89 72295.69 0.756152 3.456318 0.4198782
## ACF1
## Training set 0.02090026
forecast_ss <- forecast.ets(travel_ss,h=12)
forecast_ss
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Sep 2016 3437539 3231023 3644055 3121700 3753378
## Oct 2016 3365511 3146587 3584435 3030695 3700327
## Nov 2016 2803153 2607859 2998448 2504476 3101831
## Dec 2016 3257644 3016566 3498721 2888948 3626340
## Jan 2017 2572956 2372011 2773901 2265637 2880275
## Feb 2017 2452432 2251354 2653511 2144909 2759955
## Mar 2017 2936402 2684727 3188077 2551498 3321306
## Apr 2017 3087290 2811679 3362901 2665780 3508800
## May 2017 3201511 2904728 3498294 2747620 3655401
## Jun 2017 3267131 2953460 3580802 2787413 3746850
## Jul 2017 3894858 3508463 4281252 3303918 4485797
## Aug 2017 3839291 3446509 4232072 3238584 4439998
plot(forecast_ss)

accuracy(forecast_ss)
## ME RMSE MAE MPE MAPE MASE
## Training set 21543.29 96382.89 72295.69 0.756152 3.456318 0.4198782
## ACF1
## Training set 0.02090026
#MAPE(mean absolute percentage error) is 3.4% which is very low and hence has an accuracy of 96.6% which is good and can be used to forecast values for the next year.
#The predicted values from September 2016 to August 2017 is repetitive of the previous period. The residuals are very less and the number of visitors will be same as the previous year as the model takes into account more recent values. Also the model shows the area and confirms that the 95% of values will lie in the shaded region.
#HoltWinters
travel_holts <- HoltWinters(travel_ts_2)
travel_holts$alpha
## alpha
## 0.3272698
#The Value of alpha signifies that more weight very recent observations in the time series.
travel_holts$beta
## beta
## 0.02480278
#The value of Beta indicates there is not much trend.
travel_holts$gamma
## gamma
## 0.7684698
#Gamma is seasonal component of forecast.The higher the value the most recent seasonal component is weighed
travel_holts$coefficients
## a b s1 s2 s3 s4
## 3009108.503 9420.709 339813.279 249777.147 -387797.749 1381.222
## s5 s6 s7 s8 s9 s10
## -465808.327 -668332.884 -149157.471 -15436.641 313995.641 314927.905
## s11 s12
## 929179.097 824051.837
#From the values of alpha, beta and gamma it is clear that for our forecast recent values are weighed more heavily.
plot(residuals(travel_holts),type="p",ylab="Residuals")

#There is a constant variance in the plot
hist(residuals(travel_holts),xlab = "Residuals",main="Histogram of Residuals")

#The histogram is normally distributed
Acf(residuals(travel_holts))

#there is spike in 6 and 12 .The residuals are not significant.
plot(as.numeric(travel_holts$fitted[,1]),as.numeric(residuals(travel_holts)),type="p",
xlab="Fitted",ylab="Residuals")

#There are no patterns found
#plot(as.numeric(travel_holts$x[13:164]),as.numeric(residuals(travel_holts)),type="p",xlab="Actual",ylab="Residuals")
forecast_holts <- forecast.HoltWinters(travel_holts,h=12)
summary(forecast_holts)
##
## Forecast method: HoltWinters
##
## Model Information:
## Holt-Winters exponential smoothing with trend and additive seasonal component.
##
## Call:
## HoltWinters(x = travel_ts_2)
##
## Smoothing parameters:
## alpha: 0.3272698
## beta : 0.02480278
## gamma: 0.7684698
##
## Coefficients:
## [,1]
## a 3009108.503
## b 9420.709
## s1 339813.279
## s2 249777.147
## s3 -387797.749
## s4 1381.222
## s5 -465808.327
## s6 -668332.884
## s7 -149157.471
## s8 -15436.641
## s9 313995.641
## s10 314927.905
## s11 929179.097
## s12 824051.837
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 15175.03 123247.5 95446.26 0.4461066 4.438431 0.5543318
## ACF1
## Training set 0.20383
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Sep 2016 3358342 3201116 3515569 3117886 3598799
## Oct 2016 3277727 3111894 3443560 3024107 3531347
## Nov 2016 2649573 2475167 2823979 2382841 2916304
## Dec 2016 3048173 2865214 3231131 2768361 3327984
## Jan 2017 2590404 2398902 2781906 2297527 2883281
## Feb 2017 2397300 2197255 2597345 2091357 2703242
## Mar 2017 2925896 2717300 3134492 2606876 3244916
## Apr 2017 3069038 2851876 3286199 2736917 3401158
## May 2017 3407891 3182143 3633638 3062640 3753141
## Jun 2017 3418243 3183886 3652601 3059825 3776662
## Jul 2017 4041915 3798919 4284912 3670284 4413547
## Aug 2017 3946209 3694541 4197877 3561316 4331102
#The accuracy is good
plot(forecast_holts)

accuracy(forecast_holts)
## ME RMSE MAE MPE MAPE MASE
## Training set 15175.03 123247.5 95446.26 0.4461066 4.438431 0.5543318
## ACF1
## Training set 0.20383
#Predicted Values in one year
forecast_holts
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Sep 2016 3358342 3201116 3515569 3117886 3598799
## Oct 2016 3277727 3111894 3443560 3024107 3531347
## Nov 2016 2649573 2475167 2823979 2382841 2916304
## Dec 2016 3048173 2865214 3231131 2768361 3327984
## Jan 2017 2590404 2398902 2781906 2297527 2883281
## Feb 2017 2397300 2197255 2597345 2091357 2703242
## Mar 2017 2925896 2717300 3134492 2606876 3244916
## Apr 2017 3069038 2851876 3286199 2736917 3401158
## May 2017 3407891 3182143 3633638 3062640 3753141
## Jun 2017 3418243 3183886 3652601 3059825 3776662
## Jul 2017 4041915 3798919 4284912 3670284 4413547
## Aug 2017 3946209 3694541 4197877 3561316 4331102
# ARIMA
# No the time series is not stationary. It can be easily viewed by seeing the original
# time series. However we will confirm by following tests.
adf.test(travel_ts_2)
## Warning in adf.test(travel_ts_2): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: travel_ts_2
## Dickey-Fuller = -11.032, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
# p value is < 0.05 and hence no differencing required ,data is not stationary.
kpss.test(travel_ts_2)
## Warning in kpss.test(travel_ts_2): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: travel_ts_2
## KPSS Level = 3.5156, Truncation lag parameter = 3, p-value = 0.01
# p value is <0.05 and hence differencing required
# test confirms that time series is not stationary and hence require differecing.
nsdiffs(travel_ts_2)
## [1] 1
# 1 differencing required
tsdisplay(travel_ts_2)

#The plot is seasonal .
fit <- stl(travel_ts_2, s.window=5)
plot(fit)

plot(seasadj(fit))

#diffrencing function
travel_ts_diff1 <- diff(travel_ts_2, differences=1)
tsdisplay(travel_ts_diff1)

ndiffs(travel_ts_diff1)
## [1] 0
adf.test(travel_ts_diff1)
## Warning in adf.test(travel_ts_diff1): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: travel_ts_diff1
## Dickey-Fuller = -12.417, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
#Hence we don't need any more differences.
#The data is non-stationary now
Acf(travel_ts_diff1, lag.max=20)

Pacf(travel_ts_diff1, lag.max=20)

#auto.arima(travel_ts)
#Based on the ACF and PACF plots we take ARIMA models as (0,1,1) and (2,1,2)
auto.fit1<-Arima(travel_ts_diff1,c(0,1,1))
auto.fit2<-Arima(travel_ts_diff1,c(2,1,2))
summary(auto.fit1)
## Series: travel_ts_diff1
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -1.0000
## s.e. 0.0149
##
## sigma^2 estimated as 9.17e+10: log likelihood=-2445.01
## AIC=4894.02 AICc=4894.09 BIC=4900.34
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1806.277 301084.5 241066.8 1320.514 1324.737 2.167629
## ACF1
## Training set -0.2016917
summary(auto.fit2)
## Series: travel_ts_diff1
## ARIMA(2,1,2)
##
## Coefficients:
## ar1 ar2 ma1 ma2
## -1.0038 -0.2462 -0.1932 -0.8068
## s.e. 0.1045 0.0738 0.0809 0.0804
##
## sigma^2 estimated as 8.745e+10: log likelihood=-2439.64
## AIC=4889.28 AICc=4889.64 BIC=4905.07
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1627.196 291459.6 237877 -7811.666 8148.279 2.138947
## ACF1
## Training set 0.028517
# AIC
auto.fit1$aic
## [1] 4894.023
auto.fit2$aic
## [1] 4889.28
# BIC
auto.fit1$bic
## [1] 4900.341
auto.fit2$bic
## [1] 4905.075
# Sigma^2
auto.fit1$sigma2
## [1] 91699891350
auto.fit2$sigma2
## [1] 87447205782
#From above observations we find that auto.fit2 model is better based on the AIC and BIC values.
#So, we consider ARIMA(2,1,2) model.
#The fitted model formula is: (1 + 1.0038B)(1 + 0.2462B^2) X_t = (1 - 0.1932B)(1 - 0.8068B^2)
#Residual Analysis
#Plot of residuals
plot(auto.fit2$residuals)

#Resdiuals do not follow a particular pattern.
#Histogram of residuals
hist(auto.fit2$residuals)

#The histogram is normally distributed
#Plot of actual vs residuals
plot(as.numeric(travel_ts_diff1), as.numeric(auto.fit2$residuals),type="p",xlab= "Actual",ylab="Residuals")

#There are patterns found
#Acf plot of residuals
Acf(auto.fit2$residuals)

#There is autocorrelation and so the forecasting technique is not so good
travel_forecast_arima<-forecast.Arima(auto.fit2,h=12)
accuracy(travel_forecast_arima)
## ME RMSE MAE MPE MAPE MASE
## Training set -1627.196 291459.6 237877 -7811.666 8148.279 2.138947
## ACF1
## Training set 0.028517
#Forecast
travel_forecast_arima
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Sep 2016 -31353.579 -411410.8 348703.7 -612601.0 549893.8
## Oct 2016 88120.488 -298820.2 475061.1 -503654.2 679895.2
## Nov 2016 -47310.245 -434607.3 339986.8 -639630.0 545009.5
## Dec 2016 59211.756 -329998.2 448421.7 -536033.5 654457.0
## Jan 2017 -14363.984 -404778.2 376050.2 -611451.0 582723.0
## Feb 2017 33259.779 -357965.0 424484.5 -565066.8 631586.4
## Mar 2017 3573.294 -387892.4 395039.0 -595121.8 602268.4
## Apr 2017 21645.142 -369988.8 413279.0 -577307.2 620597.5
## May 2017 10814.958 -380838.9 402468.8 -588167.9 609797.8
## Jun 2017 17236.087 -374451.1 408923.3 -581797.8 616269.9
## Jul 2017 13457.498 -378226.0 405141.0 -585570.8 612485.8
## Aug 2017 15669.240 -376022.1 407360.6 -583371.0 614709.5
plot(travel_forecast_arima)

#Summarize the forecasting technique
summary(travel_forecast_arima)
##
## Forecast method: ARIMA(2,1,2)
##
## Model Information:
## Series: travel_ts_diff1
## ARIMA(2,1,2)
##
## Coefficients:
## ar1 ar2 ma1 ma2
## -1.0038 -0.2462 -0.1932 -0.8068
## s.e. 0.1045 0.0738 0.0809 0.0804
##
## sigma^2 estimated as 8.745e+10: log likelihood=-2439.64
## AIC=4889.28 AICc=4889.64 BIC=4905.07
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1627.196 291459.6 237877 -7811.666 8148.279 2.138947
## ACF1
## Training set 0.028517
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Sep 2016 -31353.579 -411410.8 348703.7 -612601.0 549893.8
## Oct 2016 88120.488 -298820.2 475061.1 -503654.2 679895.2
## Nov 2016 -47310.245 -434607.3 339986.8 -639630.0 545009.5
## Dec 2016 59211.756 -329998.2 448421.7 -536033.5 654457.0
## Jan 2017 -14363.984 -404778.2 376050.2 -611451.0 582723.0
## Feb 2017 33259.779 -357965.0 424484.5 -565066.8 631586.4
## Mar 2017 3573.294 -387892.4 395039.0 -595121.8 602268.4
## Apr 2017 21645.142 -369988.8 413279.0 -577307.2 620597.5
## May 2017 10814.958 -380838.9 402468.8 -588167.9 609797.8
## Jun 2017 17236.087 -374451.1 408923.3 -581797.8 616269.9
## Jul 2017 13457.498 -378226.0 405141.0 -585570.8 612485.8
## Aug 2017 15669.240 -376022.1 407360.6 -583371.0 614709.5
#The accuracy is not that good compared to other methods.
#Accuracy Summary
#Naive
accuracy(naive_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set 15081.85 301462 240545.6 -0.2643395 11.06543 1.397038
## ACF1
## Training set -0.2024705
#Seasonal Naive
accuracy(travel_naive)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 115998.2 205853.8 172182.5 4.595344 7.741379 1 0.5732204
#Simple Smoothing
accuracy(forecast_ss)
## ME RMSE MAE MPE MAPE MASE
## Training set 21543.29 96382.89 72295.69 0.756152 3.456318 0.4198782
## ACF1
## Training set 0.02090026
#HoltWInters
accuracy(forecast_holts)
## ME RMSE MAE MPE MAPE MASE
## Training set 15175.03 123247.5 95446.26 0.4461066 4.438431 0.5543318
## ACF1
## Training set 0.20383
#ARIMA
accuracy(travel_forecast_arima)
## ME RMSE MAE MPE MAPE MASE
## Training set -1627.196 291459.6 237877 -7811.666 8148.279 2.138947
## ACF1
## Training set 0.028517
#Definition
#(Simple Smoothing)
#Time series that can be described using an additive model with constant level and no seasonality, you can use simple exponential smoothing to make short-term forecasts.The simple exponential smoothing method provides a way of estimating the level at the current time point.
#Smoothing is controlled by the parameter alpha; for the estimate of the level at the current time point.
#Holtwinters
# IF you have a time series that can be described using an additive model with increasing or decreasing trend and seasonality, you can use Holt-Winters exponential smoothing to make short-term forecasts.
#Holt-Winters exponential smoothing estimates the level, slope and seasonal component at the current time point. Smoothing is controlled by three parameters: alpha, beta, and gamma, for the estimates of the level
#Arima
#While exponential smoothing methods do not make any assumptions about correlations between successive values of the time series, in some cases you can make a better predictive model by taking correlations in the data into account.
#Autoregressive Integrated Moving Average (ARIMA) models include an explicit statistical model for the irregular component of a time series, that allows for non-zero autocorrelations in the irregular component.
#NaiveForcasting
#Estimating technique in which the last period' actuals are used as this period's forecast without adjusting them.It is used only for comparison with forecasts generated by the better techniques.
#The worst method is Naive
#Ranking the forecasting Methods considering the RMSE(root mean square error)
#1 Simple Smoothing
#2 Holt Winters
#3 Seasonal Naive
#4 Arima
#5 Naive
#Conculusion
#We can see that the data has an upward trend but also seasonality is present
#Since the trend is upward, the value of time series will increase in the next year
#IF were you ,I would have given a A grade for the following reasons.
#1 I have never missed your class
#2 I got the concepts clear , i was also able to succesfully implement Forecasting methods for my project in another Course.
#3 I believe i did active participation in class ,i did take more effort to practice after class until i got the concepts clear.